Reads
My analyses are based on this course on the Hemberg Lab scRNAseq course
notes:
HE = hemogenic endothelium
Ncx1-/- lack heartbeat
signalling pathways downstream of circulation include Notch
To assess Notch signaling in Ncx1-/- HE and pro-HSCs, we performed bulk and single cell qRT-PCR,
along with markers for endothelial and hematopoietic cells, and identified an aberrant gene expression
signature in pro-HSCs and HE, with upregulation of endothelial genes and downregulation of hematopoietic genes.
This is in line with the hypothesis of a block in hematopoietic development.
Additionally, Jag-1, but not Dll4-mediated Notch signalling appeared to be decreased in Ncx1-/- HE and pro-HSC.
Not only HE and pro-HSCs were affected in the absence of circulation: also aortic smooth muscle cells were reduced,
indicating a change in the cellular composition and potentially signaling environment of the AGM.
library(dplyr)
library(scater)
library(stringr)
library(singleCellTK)
library(biomaRt)
library(stringr)
library(tibble)
library(ggrepel)
reads <- read.table('myriad/temp2', header = TRUE, stringsAsFactors = F)
anno <- read.table('EA2_index-sample_key.csv', header = TRUE, stringsAsFactors = FALSE, sep = ",")
reads <- SingleCellExperiment(assays = list(counts = as.matrix(reads)),
colData = anno)
keep_feature <- rowSums(counts(reads) > 0) > 0
reads <- reads[keep_feature, ]
knitr::kable(summarizeTable(reads))
mouse_mart=useMart(biomart="ensembl", dataset="mmusculus_gene_ensembl")
attr.string = c('ensembl_gene_id', 'external_gene_name', 'chromosome_name')
attr.string = c(attr.string, 'start_position', 'end_position', 'strand')
attr.string = c(attr.string, 'description', 'percentage_gene_gc_content')
attr.string = c(attr.string, 'gene_biotype')
gene.annotation = getBM(attributes=attr.string,
filters = 'ensembl_gene_id',
values = rownames(reads),
mart = mouse_mart)
## some genes do not have annotation because their ids are retired
gene.missing <- setdiff(rownames(reads), gene.annotation$ensembl_gene_id)
genes2keep <- match(gene.annotation$ensembl_gene_id, rownames(reads))
reads <- reads[genes2keep,]
rowData(reads) <- gene.annotation
rownames(reads) <- uniquifyFeatureNames(rowData(reads)$ensembl_gene_id, rowData(reads)$external_gene_name)
is.mito = which(rowData(reads)$chromosome_name == "MT")
reads <- calculateQCMetrics(
reads,
feature_controls = list(
MT = is.mito)
)
##Counts
library(dplyr)
library(scater)
library(stringr)
library(singleCellTK)
library(biomaRt)
library(stringr)
library(tibble)
library(ggrepel)
reads <- readRDS('reads_ea2.rds')
hist(
reads$total_counts,
breaks = 100
)
abline(v = 5.2e5, col = "red")

filter_by_total_counts <- (reads$total_counts > 5.2e5 )
##number of features
hist(
reads$total_features_by_counts_endogenous,
breaks = 100
)
abline(v = 4600, col = "red")

filter_by_expr_features <- (reads$total_features_by_counts_endogenous > 4600)
percent Mitochondrial
hist(reads$pct_counts_MT, breaks = 80)

filter_by_MT <- reads$pct_counts_MT < 10
summary
reads$use <- (
# sufficient features (genes)
filter_by_expr_features &
# mitochondrial reads
filter_by_MT &
# by number of counts
filter_by_total_counts
)
# remove failed
reads <- reads[,reads$use]
knitr::kable(summarizeTable(reads))
| Number of Samples |
371 |
| Number of Genes |
32173 |
| Average number of reads per cell |
719331 |
| Average number of genes per cell |
8076 |
| Samples with <1700 detected genes |
0 |
| Genes with no expression across all samples |
651 |
PCA pre normalization
anno <- read.table('EA2_index-sample_key.csv', header = TRUE, stringsAsFactors = FALSE, sep = ",")
assay(reads, "logcounts") <- log2(counts(reads) + 1)
my_pca <- prcomp(t(logcounts(reads)))
x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == x)]^2/sum(my_pca$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == y)]^2/sum(my_pca$sdev^2)))
my_data <- my_pca$x[,1:2]
my_data <- data.frame(my_data) %>% rownames_to_column("pca_rows") %>% dplyr::mutate("pca_rows" = unlist(purrr::map(pca_rows, str_replace, "_N7.+","" )))
my_data <- left_join(my_data,anno, by = c("pca_rows" = "X384_EA2_Nextera.XT_address"))
my_data$geno <- unlist(purrr::map(str_split(my_data$population.name, " "), 1))
my_data$pheno <- unlist(purrr::map(str_split(my_data$population.name, " "), 3))
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
ggplot(my_data,aes(x = PC1, y = PC2, shape = pheno, colour = geno)) + geom_point(size = 3) + theme(axis.line = element_line(size = 0.5,
linetype = "solid"), panel.grid.major = element_line(colour = "gray94",
size = 0.35), panel.grid.minor = element_line(colour = "gray98",
size = 0.1), panel.background = element_rect(fill = NA),
legend.key = element_rect(fill = NA),
legend.background = element_rect(fill = NA)) +
labs(colour = "genotype") + labs(shape = "phenotype") +
labs(x = x.lab, y = y.lab) +
scale_colour_manual(values = cbbPalette ) + ggtitle(label = "PCA pre-normalisation")

Screes
pca.var <- my_pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
barplot(pca.var.per, main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")

pca.var <- my_pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
barplot(pca.var.per[1:20], main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")

TSNE pre normalisation
my_tsnePlot <- scater::plotTSNE(reads, colour_by = "population.name")
my_tsnePlot$data$genotype<-unlist(purrr::map(str_split(my_tsnePlot$data$colour_by, " "), 1))
my_tsnePlot$data$pheno<-unlist(purrr::map(str_split(my_tsnePlot$data$colour_by, " "), 3))
my_data <- my_tsnePlot$data
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
ggplot(my_data,aes(x = X, y = Y, shape = pheno, colour = genotype)) + geom_point(size = 3) + theme(axis.line = element_line(size = 0.5,
linetype = "solid"), panel.grid.major = element_line(colour = "gray94",
size = 0.35), panel.grid.minor = element_line(colour = "gray98",
size = 0.1), panel.background = element_rect(fill = NA),
legend.key = element_rect(fill = NA),
legend.background = element_rect(fill = NA)) +
labs(colour = "genotype") + labs(shape = "phenotype") +
scale_colour_manual(values = cbbPalette ) +
ggtitle(label = "TSNE pre-normalisation")

#saveRDS(reads.use, "reads.rds")
PCA post normalisation
library(scran)
reads <- computeSumFactors(reads)
reads <- scater::normalize(reads)
saveRDS(reads, 'reads_qc.rds')
my_pca<-prcomp(t(logcounts(reads)))
x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == x)]^2/sum(my_pca$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == y)]^2/sum(my_pca$sdev^2)))
my_data <- my_pca$x[,1:2]
my_data <- data.frame(my_data) %>% rownames_to_column("pca_rows") %>% dplyr::mutate("pca_rows" = unlist(purrr::map(pca_rows, str_replace, "_N7.+","" )))
my_data <- left_join(my_data,anno, by = c("pca_rows" = "X384_EA2_Nextera.XT_address"))
my_data$geno <- unlist(purrr::map(str_split(my_data$population.name, " "), 1))
my_data$pheno <- unlist(purrr::map(str_split(my_data$population.name, " "), 3))
my_pca_data <- my_data
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
ggplot(my_data,aes(x = PC1, y = PC2, shape = pheno, colour = geno)) + geom_point(size = 3) + theme(axis.line = element_line(size = 0.5,
linetype = "solid"), panel.grid.major = element_line(colour = "gray94",
size = 0.35), panel.grid.minor = element_line(colour = "gray98",
size = 0.1), panel.background = element_rect(fill = NA),
legend.key = element_rect(fill = NA),
legend.background = element_rect(fill = NA)) +
labs(colour = "genotype") + labs(shape = "phenotype") +
labs(x = x.lab, y = y.lab) +
scale_colour_manual(values = cbbPalette ) + ggtitle(label = "PCA post-normalisation")

###Screes
pca.var <- my_pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
barplot(pca.var.per, main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")

pca.var <- my_pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
barplot(pca.var.per[1:20], main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")

TSNE Post normalisation
# my_tsnePlot <- scater::plotTSNE(reads, colour_by = "population.name")
# saveRDS(my_tsnePlot,"my_tsnePlot.rds")
my_tsnePlot <- readRDS("my_tsnePlot.rds")
my_tsnePlot$data$genotype<-unlist(purrr::map(str_split(my_tsnePlot$data$colour_by, " "), 1))
my_tsnePlot$data$pheno<-unlist(purrr::map(str_split(my_tsnePlot$data$colour_by, " "), 3))
my_data <- my_tsnePlot$data
my_data <- my_data %>% rownames_to_column(var = "sample_id") %>% tbl_df()
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
ggplot(my_data,aes(x = X, y = Y, shape = pheno, colour = genotype)) + geom_point(size = 3) + theme(axis.line = element_line(size = 0.5,
linetype = "solid"), panel.grid.major = element_line(colour = "gray94",
size = 0.35), panel.grid.minor = element_line(colour = "gray98",
size = 0.1), panel.background = element_rect(fill = NA),
legend.key = element_rect(fill = NA),
legend.background = element_rect(fill = NA)) +
labs(colour = "genotype") + labs(shape = "phenotype") +
scale_colour_manual(values = cbbPalette ) +
ggtitle(label = "TSNE post-normalisation")

PCA Loadings post normalization
x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == x)]^2/sum(my_pca$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == y)]^2/sum(my_pca$sdev^2)))
my_data <- my_pca_data
# extract Loadings
# Extract loadings of the variables
loading_scores_pc1 <- my_pca$rotation[,1]
gene_scores <- abs(loading_scores_pc1)
gene_score_ranked <- sort(gene_scores, decreasing=TRUE)
top_10_genes_PC1 <- names(gene_score_ranked[1:10])
loading_scores_pc2 <- my_pca$rotation[,2]
gene_scores <- abs(loading_scores_pc2)
gene_score_ranked <- sort(gene_scores, decreasing=TRUE)
top_10_genes_PC2 <- names(gene_score_ranked[1:10])
top_20_genes <- c(top_10_genes_PC1,top_10_genes_PC2)
pca_loadings<-data.frame(my_pca$rotation[which(rownames(my_pca$rotation) %in% top_20_genes),])[,1:2]
P <- ggplot(my_data,aes(x = PC1, y = PC2, shape = pheno, colour = geno)) + geom_point(size = 3, alpha =0.6 ) +
theme(axis.line = element_line(size = 0.5,
linetype = "solid"), panel.grid.major = element_line(colour = "gray94",
size = 0.35), panel.grid.minor = element_line(colour = "gray98",
size = 0.1), panel.background = element_rect(fill = NA),
legend.key = element_rect(fill = NA),
legend.background = element_rect(fill = NA)) +
labs(colour = "genotype") + labs(shape = "phenotype") +
labs(x = x.lab, y = y.lab) +
scale_colour_manual(values = cbbPalette ) + ggtitle(label = "PCA post-normalisation")
P + geom_segment(inherit.aes=FALSE,data = pca_loadings, aes(x = 0, y = 0, xend = (PC1),
yend = (PC2)), arrow = arrow(length = unit(1/2, "picas")),
color = "purple") + geom_label_repel(inherit.aes = FALSE, data=pca_loadings, aes(x =(PC1), y= (PC2), label = rownames(pca_loadings)))

##DESeq2 PCA with VST
library(DESeq2)
anno <- colData(reads)[,1:3]
anno$genotype <- unlist(purrr::map(str_split(anno$population.name, " "), 1))
anno$phenotype <- unlist(purrr::map(str_split(anno$population.name, " "), 3))
dds <- DESeqDataSetFromMatrix(countData =counts(reads),colData = anno, design = ~genotype )
rld <- vst(dds)
my_pca <- prcomp(t(assay(rld)))
x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == x)]^2/sum(my_pca$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == y)]^2/sum(my_pca$sdev^2)))
my_data <- my_pca$x[,1:2]
my_data<-left_join(rownames_to_column(data.frame(my_data),"pca_rows"),rownames_to_column(data.frame(anno),"sample_id"), by = c("pca_rows" = "sample_id"))
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
ggplot(my_data,aes(x = PC1, y = PC2, shape = phenotype, colour = genotype)) + geom_point(size = 3) + theme(axis.line = element_line(size = 0.5,
linetype = "solid"), panel.grid.major = element_line(colour = "gray94",
size = 0.35), panel.grid.minor = element_line(colour = "gray98",
size = 0.1), panel.background = element_rect(fill = NA),
legend.key = element_rect(fill = NA),
legend.background = element_rect(fill = NA)) +
labs(colour = "genotype") + labs(shape = "phenotype") +
labs(x = x.lab, y = y.lab) +
scale_colour_manual(values = cbbPalette ) + ggtitle(label = "PCA DESeq2 VST-normalisation")

Mean count per gene
counts <- counts(reads)
nGenes <- length(counts[,1])
coverage <- colSums(counts)/nGenes
reads$genotype<-unlist(purrr::map(str_split(reads$population.name, " "), 1))
cell.labels <- reads$genotype
colnames(counts) <- cell.labels
cell.labels <- as.factor(cell.labels)
cov_df<-data.frame(coverage)
cov_df$names<-cell.labels
ggplot(cov_df, aes(x=names,y=coverage)) + geom_violin() + geom_jitter() + theme(axis.line = element_line(size = 0.5,
linetype = "solid"), panel.grid.major = element_line(colour = "gray90"),
panel.grid.minor = element_line(colour = "gray94",
linetype = "dashed"), panel.background = element_rect(fill = NA))

playing with histograms
library(tidyverse)
my_goi<-c("Dll4","Efnb2","Erg","Gata2","Gfi1","Hes1","Hey1","Hey2","Itga2b","Jag1","Jag2","Kdr","Klf2","Lmo2","Myb","Notch1","Notch4","Runx1","Tal1","Tek","Gapdh","Ubc")
mcols(reads)$strand<-NULL
my_reads<-reads[which(rownames(reads) %in% my_goi),]
my_df<-data.frame(logcounts(my_reads))
colnames(my_df) <- str_replace(my_reads$population.name," #[0-9]+ " ,"_")
my_df<-my_df %>% rownames_to_column('gene')
my_df<- gather(my_df,condition,count,-gene)
my_df$condition<-str_replace(my_df$condition,"\\.[0-9]+","")
base <- ggplot(my_df,aes(count, fill=condition)) + geom_histogram(aes(y = ..count..),bins = 50) + scale_y_continuous(trans="log1p", breaks = NULL)
base + facet_wrap(~gene) + theme_bw() + xlab(label = "normalised log count") + ylab(label="cell count")

variation explained
reads$condition_ID_main <- str_replace(my_reads$population.name," #[0-9]+ " ,"_")
reads$genotype <- unlist(purrr::map(str_split(reads$population.name, " "), 1))
reads$phenotype <- unlist(purrr::map(str_split(reads$population.name, " "), 3))
my_vars<-getVarianceExplained(reads, exprs_values = "logcounts", variables = c("population.name", "condition_ID_main","genotype","phenotype"))
plotExplanatoryVariables(my_vars)

---
title: "EA2 NextSeq Prelim"
author: <a href="https://chela-james.github.io/"> <h3> Chela James </h3> </a> \newline Cancer Institute, UCL, UK
date: 23 April 2019
output:
  bookdown::html_document2:
    toc: true
    toc_float: true
    toc_depth: 3
    number_sections: true
    theme: united
    highlight: tango
    df_print: paged
    code_folding: hide
    code_download: true
    self_contained: true
    keep_md: false
    encoding: "UTF-8"
editor_options: 
  chunk_output_type: console
---

```{r setup, include=FALSE}
htmltools::tagList(rmarkdown::html_dependency_font_awesome())
knitr::opts_chunk$set(eval = TRUE, cache = FALSE, warning = FALSE, message = FALSE)
```

## Reads

[My analyses are based on this course on the Hemberg Lab scRNAseq course
](https://hemberg-lab.github.io/scRNA.seq.course/cleaning-the-expression-matrix.html#expression-qc-reads)

notes:
HE  = hemogenic endothelium 
Ncx1-/- lack heartbeat
signalling pathways downstream of circulation include Notch

To assess Notch signaling in Ncx1-/-  HE and pro-HSCs, we performed bulk and single cell qRT-PCR, 
along with markers for endothelial and hematopoietic cells, and identified an aberrant gene expression 
signature in pro-HSCs and HE, with upregulation of endothelial genes and downregulation of hematopoietic genes. 
This is in line with the hypothesis of a block in hematopoietic development. 
Additionally, Jag-1, but not Dll4-mediated Notch signalling appeared to be decreased in Ncx1-/- HE and pro-HSC. 
Not only HE and pro-HSCs were affected in the absence of circulation: also aortic smooth muscle cells were reduced, 
indicating a change in the cellular composition and potentially signaling environment of the AGM. 

```{r message=FALSE, eval = F}
library(dplyr)
library(scater)
library(stringr)
library(singleCellTK)
library(biomaRt)
library(stringr)
library(tibble)
library(ggrepel)
reads <- read.table('myriad/temp2', header = TRUE, stringsAsFactors = F)
anno <- read.table('EA2_index-sample_key.csv', header = TRUE, stringsAsFactors = FALSE, sep = ",")

reads <- SingleCellExperiment(assays = list(counts = as.matrix(reads)),
                              colData = anno)

keep_feature <- rowSums(counts(reads) > 0) > 0
reads <- reads[keep_feature, ]
knitr::kable(summarizeTable(reads))

mouse_mart=useMart(biomart="ensembl", dataset="mmusculus_gene_ensembl")

attr.string = c('ensembl_gene_id', 'external_gene_name', 'chromosome_name')
attr.string = c(attr.string, 'start_position', 'end_position', 'strand')
attr.string = c(attr.string, 'description', 'percentage_gene_gc_content')
attr.string = c(attr.string, 'gene_biotype')


gene.annotation = getBM(attributes=attr.string, 
                        filters =  'ensembl_gene_id', 
                        values = rownames(reads), 
                        mart = mouse_mart)

## some genes do not have annotation because their ids are retired

gene.missing <- setdiff(rownames(reads), gene.annotation$ensembl_gene_id)


genes2keep <- match(gene.annotation$ensembl_gene_id, rownames(reads))
reads <- reads[genes2keep,]
rowData(reads) <- gene.annotation
rownames(reads) <- uniquifyFeatureNames(rowData(reads)$ensembl_gene_id, rowData(reads)$external_gene_name)

is.mito = which(rowData(reads)$chromosome_name == "MT")
reads <- calculateQCMetrics(
    reads,
    feature_controls = list(
        MT =  is.mito)
    )

```

##Counts

```{r, message=F,warning=F}
library(dplyr)
library(scater)
library(stringr)
library(singleCellTK)
library(biomaRt)
library(stringr)
library(tibble)
library(ggrepel)
reads <- readRDS('reads_ea2.rds')
hist(
    reads$total_counts,
    breaks = 100
)
abline(v = 5.2e5, col = "red")

filter_by_total_counts <- (reads$total_counts > 5.2e5 )
```

##number of features

```{r}
hist(
    reads$total_features_by_counts_endogenous,
    breaks = 100
)
abline(v = 4600, col = "red")

filter_by_expr_features <- (reads$total_features_by_counts_endogenous > 4600)

```

## percent Mitochondrial

```{r}
hist(reads$pct_counts_MT, breaks = 80)
filter_by_MT <- reads$pct_counts_MT < 10
```

## summary

```{r}
reads$use <- (
    # sufficient features (genes)
    filter_by_expr_features &
    # mitochondrial reads
    filter_by_MT &
    # by number of counts
    filter_by_total_counts
)

# remove failed 
reads <- reads[,reads$use]

knitr::kable(summarizeTable(reads))

```

## PCA pre normalization

```{r, message = FALSE, warning = FALSE}

anno <- read.table('EA2_index-sample_key.csv', header = TRUE, stringsAsFactors = FALSE, sep = ",")
assay(reads, "logcounts") <- log2(counts(reads) + 1)

my_pca <- prcomp(t(logcounts(reads)))

x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == x)]^2/sum(my_pca$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == y)]^2/sum(my_pca$sdev^2)))
my_data <- my_pca$x[,1:2]
my_data <- data.frame(my_data) %>% rownames_to_column("pca_rows") %>% dplyr::mutate("pca_rows" = unlist(purrr::map(pca_rows, str_replace, "_N7.+","" )))

my_data <- left_join(my_data,anno, by = c("pca_rows" = "X384_EA2_Nextera.XT_address"))

my_data$geno <- unlist(purrr::map(str_split(my_data$population.name, " "), 1))
my_data$pheno <- unlist(purrr::map(str_split(my_data$population.name, " "), 3))

cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
ggplot(my_data,aes(x = PC1, y = PC2, shape = pheno, colour = geno)) + geom_point(size = 3) + theme(axis.line = element_line(size = 0.5, 
    linetype = "solid"), panel.grid.major = element_line(colour = "gray94", 
    size = 0.35), panel.grid.minor = element_line(colour = "gray98", 
    size = 0.1), panel.background = element_rect(fill = NA), 
    legend.key = element_rect(fill = NA), 
    legend.background = element_rect(fill = NA)) + 
    labs(colour = "genotype") + labs(shape = "phenotype") +
    labs(x = x.lab, y = y.lab) +
    scale_colour_manual(values = cbbPalette ) + ggtitle(label = "PCA pre-normalisation")
```

### Screes

```{r}
pca.var <- my_pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
 
barplot(pca.var.per, main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")
```


```{r}
pca.var <- my_pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
 
barplot(pca.var.per[1:20], main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")
```

## TSNE pre normalisation

```{r}
my_tsnePlot <- scater::plotTSNE(reads, colour_by = "population.name")

my_tsnePlot$data$genotype<-unlist(purrr::map(str_split(my_tsnePlot$data$colour_by, " "), 1))
my_tsnePlot$data$pheno<-unlist(purrr::map(str_split(my_tsnePlot$data$colour_by, " "), 3))

my_data <- my_tsnePlot$data

cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
ggplot(my_data,aes(x = X, y = Y, shape = pheno, colour = genotype)) + geom_point(size = 3) + theme(axis.line = element_line(size = 0.5, 
    linetype = "solid"), panel.grid.major = element_line(colour = "gray94", 
    size = 0.35), panel.grid.minor = element_line(colour = "gray98", 
    size = 0.1), panel.background = element_rect(fill = NA), 
    legend.key = element_rect(fill = NA), 
    legend.background = element_rect(fill = NA)) + 
    labs(colour = "genotype") + labs(shape = "phenotype") +
    scale_colour_manual(values = cbbPalette ) +
    ggtitle(label = "TSNE pre-normalisation")

#saveRDS(reads.use, "reads.rds")
```

## PCA post normalisation

```{r, message = F, warning=F}
library(scran)

reads <- computeSumFactors(reads)
reads <- scater::normalize(reads)
saveRDS(reads, 'reads_qc.rds')
my_pca<-prcomp(t(logcounts(reads)))

x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == x)]^2/sum(my_pca$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == y)]^2/sum(my_pca$sdev^2)))
my_data <- my_pca$x[,1:2]
my_data <- data.frame(my_data) %>% rownames_to_column("pca_rows") %>% dplyr::mutate("pca_rows" = unlist(purrr::map(pca_rows, str_replace, "_N7.+","" )))

my_data <- left_join(my_data,anno, by = c("pca_rows" = "X384_EA2_Nextera.XT_address"))

my_data$geno <- unlist(purrr::map(str_split(my_data$population.name, " "), 1))
my_data$pheno <- unlist(purrr::map(str_split(my_data$population.name, " "), 3))

my_pca_data <- my_data
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
ggplot(my_data,aes(x = PC1, y = PC2, shape = pheno, colour = geno)) + geom_point(size = 3) + theme(axis.line = element_line(size = 0.5, 
    linetype = "solid"), panel.grid.major = element_line(colour = "gray94", 
    size = 0.35), panel.grid.minor = element_line(colour = "gray98", 
    size = 0.1), panel.background = element_rect(fill = NA), 
    legend.key = element_rect(fill = NA), 
    legend.background = element_rect(fill = NA)) + 
    labs(colour = "genotype") + labs(shape = "phenotype") +
    labs(x = x.lab, y = y.lab) +
    scale_colour_manual(values = cbbPalette ) + ggtitle(label = "PCA post-normalisation")
```

###Screes

```{r}
pca.var <- my_pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
 
barplot(pca.var.per, main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")

```

```{r}
pca.var <- my_pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
barplot(pca.var.per[1:20], main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")
```

## TSNE  Post normalisation

```{r}

# my_tsnePlot <- scater::plotTSNE(reads, colour_by = "population.name")
# saveRDS(my_tsnePlot,"my_tsnePlot.rds")
my_tsnePlot <- readRDS("my_tsnePlot.rds")
my_tsnePlot$data$genotype<-unlist(purrr::map(str_split(my_tsnePlot$data$colour_by, " "), 1))
my_tsnePlot$data$pheno<-unlist(purrr::map(str_split(my_tsnePlot$data$colour_by, " "), 3))

my_data <- my_tsnePlot$data
my_data <- my_data %>% rownames_to_column(var = "sample_id") %>% tbl_df()

cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
ggplot(my_data,aes(x = X, y = Y, shape = pheno, colour = genotype)) + geom_point(size = 3) + theme(axis.line = element_line(size = 0.5, 
    linetype = "solid"), panel.grid.major = element_line(colour = "gray94", 
    size = 0.35), panel.grid.minor = element_line(colour = "gray98", 
    size = 0.1), panel.background = element_rect(fill = NA), 
    legend.key = element_rect(fill = NA), 
    legend.background = element_rect(fill = NA)) + 
    labs(colour = "genotype") + labs(shape = "phenotype") +
    scale_colour_manual(values = cbbPalette ) +
    ggtitle(label = "TSNE post-normalisation")



```


## PCA Loadings post normalization

```{r}
x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == x)]^2/sum(my_pca$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == y)]^2/sum(my_pca$sdev^2)))

my_data <- my_pca_data
# extract Loadings
# Extract loadings of the variables
loading_scores_pc1 <- my_pca$rotation[,1]
gene_scores <- abs(loading_scores_pc1)
gene_score_ranked <- sort(gene_scores, decreasing=TRUE)
top_10_genes_PC1 <- names(gene_score_ranked[1:10])

loading_scores_pc2 <- my_pca$rotation[,2]
gene_scores <- abs(loading_scores_pc2)
gene_score_ranked <- sort(gene_scores, decreasing=TRUE)
top_10_genes_PC2 <- names(gene_score_ranked[1:10])

top_20_genes <- c(top_10_genes_PC1,top_10_genes_PC2)
pca_loadings<-data.frame(my_pca$rotation[which(rownames(my_pca$rotation) %in% top_20_genes),])[,1:2]

P <- ggplot(my_data,aes(x = PC1, y = PC2, shape = pheno, colour = geno)) + geom_point(size = 3, alpha =0.6 ) +
    theme(axis.line =   element_line(size = 0.5, 
    linetype = "solid"), panel.grid.major = element_line(colour = "gray94", 
    size = 0.35), panel.grid.minor = element_line(colour = "gray98", 
    size = 0.1), panel.background = element_rect(fill = NA), 
    legend.key = element_rect(fill = NA), 
    legend.background = element_rect(fill = NA)) + 
    labs(colour = "genotype") + labs(shape = "phenotype") +
    labs(x = x.lab, y = y.lab) +
    scale_colour_manual(values = cbbPalette ) + ggtitle(label = "PCA post-normalisation")

P + geom_segment(inherit.aes=FALSE,data = pca_loadings, aes(x = 0, y = 0, xend = (PC1),
     yend = (PC2)), arrow = arrow(length = unit(1/2, "picas")),
     color = "purple") + geom_label_repel(inherit.aes = FALSE, data=pca_loadings, aes(x =(PC1), y= (PC2), label = rownames(pca_loadings)))


```

##DESeq2 PCA with VST

```{r , message = F, warning=F}
library(DESeq2)
anno <- colData(reads)[,1:3]
anno$genotype <- unlist(purrr::map(str_split(anno$population.name, " "), 1))
anno$phenotype <- unlist(purrr::map(str_split(anno$population.name, " "), 3))
dds  <- DESeqDataSetFromMatrix(countData =counts(reads),colData = anno, design = ~genotype )
rld  <- vst(dds)

my_pca <- prcomp(t(assay(rld)))

x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == x)]^2/sum(my_pca$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * my_pca$sdev[which(colnames(my_pca$x) == y)]^2/sum(my_pca$sdev^2)))
my_data <- my_pca$x[,1:2]


my_data<-left_join(rownames_to_column(data.frame(my_data),"pca_rows"),rownames_to_column(data.frame(anno),"sample_id"), by = c("pca_rows" = "sample_id"))


cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
ggplot(my_data,aes(x = PC1, y = PC2, shape = phenotype, colour = genotype)) + geom_point(size = 3) + theme(axis.line = element_line(size = 0.5, 
    linetype = "solid"), panel.grid.major = element_line(colour = "gray94", 
    size = 0.35), panel.grid.minor = element_line(colour = "gray98", 
    size = 0.1), panel.background = element_rect(fill = NA), 
    legend.key = element_rect(fill = NA), 
    legend.background = element_rect(fill = NA)) + 
    labs(colour = "genotype") + labs(shape = "phenotype") +
    labs(x = x.lab, y = y.lab) +
    scale_colour_manual(values = cbbPalette ) + ggtitle(label = "PCA DESeq2 VST-normalisation")

```

## Mean count per gene

```{r}
counts <- counts(reads)
nGenes <- length(counts[,1])

coverage <- colSums(counts)/nGenes

reads$genotype<-unlist(purrr::map(str_split(reads$population.name, " "), 1))

cell.labels <- reads$genotype
colnames(counts) <- cell.labels
cell.labels <- as.factor(cell.labels)
cov_df<-data.frame(coverage)
cov_df$names<-cell.labels

ggplot(cov_df, aes(x=names,y=coverage)) + geom_violin() + geom_jitter() + theme(axis.line = element_line(size = 0.5, 
    linetype = "solid"), panel.grid.major = element_line(colour = "gray90"), 
    panel.grid.minor = element_line(colour = "gray94", 
        linetype = "dashed"), panel.background = element_rect(fill = NA))

```

## playing with histograms

```{r, warning=F,message=F}
library(tidyverse)
my_goi<-c("Dll4","Efnb2","Erg","Gata2","Gfi1","Hes1","Hey1","Hey2","Itga2b","Jag1","Jag2","Kdr","Klf2","Lmo2","Myb","Notch1","Notch4","Runx1","Tal1","Tek","Gapdh","Ubc")
mcols(reads)$strand<-NULL
my_reads<-reads[which(rownames(reads) %in% my_goi),]
my_df<-data.frame(logcounts(my_reads))
colnames(my_df) <- str_replace(my_reads$population.name," #[0-9]+ " ,"_")
my_df<-my_df %>% rownames_to_column('gene')
my_df<- gather(my_df,condition,count,-gene)
my_df$condition<-str_replace(my_df$condition,"\\.[0-9]+","")
base <- ggplot(my_df,aes(count, fill=condition)) + geom_histogram(aes(y = ..count..),bins = 50) + scale_y_continuous(trans="log1p", breaks = NULL)
base + facet_wrap(~gene)  + theme_bw() + xlab(label = "normalised log count") + ylab(label="cell count")


```

## variation explained


```{r}
reads$condition_ID_main <- str_replace(my_reads$population.name," #[0-9]+ " ,"_")
reads$genotype <- unlist(purrr::map(str_split(reads$population.name, " "), 1))
reads$phenotype <- unlist(purrr::map(str_split(reads$population.name, " "), 3))
my_vars<-getVarianceExplained(reads, exprs_values = "logcounts", variables =  c("population.name", "condition_ID_main","genotype","phenotype"))
plotExplanatoryVariables(my_vars)

```


